## Clear R-workspace
rm(list=ls(all=TRUE))

## Close all graphic devices
graphics.off()
#####################
### Load packages ###
#####################
library(pacman)

pacman::p_load(extrafont,DT,stringr,dplyr,plyr,tibble,tidyverse,data.table, ggplot2,strex,Hmisc,hrbrthemes,
               ggpubr,mlbench,Amelia,caret, DMwR, DataExplorer,FactoMineR,OutlierDetection,factoextra, parallel,doParallel,scales,precrec,
               RColorBrewer, pander, rlist,
               reconPlots,
               survival,
               survminer, strex, ggplotify,flextable,ggsci, naniar,viridis,cowplot,ggthemes,dichromat)

extrafont::loadfonts(device="win")
windowsFonts(sans="Palatino Linotype")
loadfonts(device="win")
loadfonts(device="postscript")
###########################
### Main analysis paths ###
###########################
# Main
scriptsPath <- paste0("scripts/")
scriptsFunctionsPath <- paste0(scriptsPath,"functions/")
projectDataPath <- paste0("data/")
# Input
tcgaInputData <- paste0(projectDataPath,"TCGA/")
antiPDL1publicInputData <- paste0(projectDataPath,"antiPD(L)1_public/")
otherInputData <- paste0(projectDataPath,"other/")
referenceInputData <- paste0(projectDataPath,"reference/")
# # Output
projectOutDataPath <- paste0("output/data_files/")
# tcgaIntermediateData <- paste0(projectOutDataPath,"TCGA/")
antiPDL1publicIntermediateData <- paste0(projectOutDataPath,"antiPD(L)1_public/")

  
figuresPath <- paste0("output/figures/")
# tablesPath <- paste0("output/tables/")
# modelsPath <- paste0("output/models/")

# # #Intermediate output
# testDataOut <-paste0(modelsPath,"test_data/")
# rocCompareOut <-paste0(modelsPath,"roc_compare/")
# modelsInterm <- paste0(modelsPath,"models_interm_files/")

# Session/dependencies
sessionInfoPath <- paste0("session_info/")
##########################
### FOLDER/FILES NAMES ###
##########################
# Create list with folders names for the different datasets.
datasets <- c("Hugo","Riaz","Gide","Liu","Rod","Miao","Immotion150","Kim","Mari", "Powles","Braun")
data.foldersList <- as.list(c("Hugo_MEL","Riaz_MEL","Gide_MEL", "Liu_MEL","Rodriguez_MEL","Miao_RCC","Immo150_RCC","Kim_GAS","Mari_BLA","Powles_BLA","Braun_RCC"))
names(data.foldersList)<- c("Hugo","Riaz","Gide", "Liu","Rod", "Miao","Immotion150","Kim","Mari","Powles","Braun")

##################################
### LOAD SOURCE FUNCTIONS FILE ###
##################################
source(paste0(scriptsFunctionsPath,"MissingData_functions.R"), local=T)


#####################
### File suffixes ###
#####################
Rdata.suffix <- ".RData"

# Other script variables
# analysis_type <- "_TCRaIGH_noLiu"
modelTypeA <- "_TCRArichnessIGH.ds"
modelTypeB <- "_noLIU"
modelTypeC <- "_mintcr4bcr10"

1 Loading Sample and Clone data

We are using the data that were TCRa min 4 clones, BCR igh min 10 clones

immdata.TcrBcr.full.red <- loadRData(paste0(antiPDL1publicIntermediateData,"immdata.TcrBcr.full.red",modelTypeA,modelTypeB,modelTypeC,".RData"))
immdata.TcrBcr.full.red$response <- ifelse(immdata.TcrBcr.full.red$response=="CR+PR","CRPR",immdata.TcrBcr.full.red$response)

2 Looking at available data

First filtering, to keep only CR, PR, PD samples

## FILTER DATA FOR PD, CR, PR (.sub)
immdata.TcrBcr.full.red.sub <- immdata.TcrBcr.full.red %>% filter(response %in% c("PD","CRPR")) %>% droplevels()

From 695 samples (that include all responses), we drop down to 519, that are CR, PR, PD pre-treatment samples.

Looking at how many of the data have TMB info

datatable(immdata.TcrBcr.full.red.sub %>% select(run_accession,response,tissue) %>% group_by(tissue,response) %>% dplyr::summarise(no_rows=length(response)), extensions = 'Buttons', options = list(
    dom = 'Bfrtip',
    buttons = c('copy', 'excel', 'csv' ),
    scrollX=TRUE,
    pageLength=15
  ),
  caption = 'Number of responders/non-responders in all tissues'
)

Looking at how many of the data have TMB info

datatable(immdata.TcrBcr.full.red.sub %>% filter(complete.cases(TMB)) %>% select(run_accession,response,tissue) %>% group_by(tissue,response) %>% dplyr::summarise(no_rows=length(response)), extensions = 'Buttons', options = list(
    dom = 'Bfrtip',
    buttons = c('copy', 'excel', 'csv' ),
    scrollX=TRUE,
    pageLength=15
  ),
  caption = 'Number of responders/non-responders in all tissues that have TMB'
)

From 519 CRPR, PD samples, we drop down to 373, that are CR, PR, PD pre-treatment samples. The only tissues we can actually use for the model are melanoma and bladder, that have more than 100 samples.

Prepare other variables and dataframes necessary for the analysis.

## EXTRACT RESPONSE, NOMINAL, MEASURE VARIABLES FROM DATA
response.var <- "response"
nominal.vars <- c("dataset","tissue","gender")

measure.vars <-colnames(immdata.TcrBcr.full.red.sub)[c(2:10,12,32:43)]
measure.vars <- measure.vars[c(6,22,1:3,20,21,4:5,10,7:9,11:19)]



## CHECK INFINITE LOG TMB VALUES
immdata.TcrBcr.full.red.sub[which(is.infinite(immdata.TcrBcr.full.red.sub$logTMB)),]
##  [1] run_accession      TuTACK_sigscore    TIS_sigscore       PDL1              
##  [5] CD8A               CD8B               TCRArichness.ds    StromalScore      
##  [9] ImmuneScore        ESTIMATEScore      title              age               
## [13] gender             disease_status     biopsy_site        primary_tumor     
## [17] treatment          prior_treatment    response           OS.time           
## [21] OS                 PFS.time           PFS                pre_on_treatment  
## [25] total_muts         nonsyn_muts        clonal_muts        subclonal_muts    
## [29] TMB                tissue             dataset            Bcells            
## [33] CD8cells           CD4cells           Tregs              NKcells           
## [37] DCcells            Monocytes          Granulocytes       Macrophages       
## [41] TumorPurity        logTMB             BCR.IGHrichness.ds
## <0 rows> (or 0-length row.names)
## Extract tissues in data so far
tissues <- unique(immdata.TcrBcr.full.red.sub$tissue)

# Keep samples with TMB (.tmb)
immdata.TcrBcr.full.red.sub.tmb <- immdata.TcrBcr.full.red.sub %>% filter(complete.cases(logTMB)) %>% droplevels()#;nrow(immdata.TcrBcr.full.red.sub.tmb)
# ONLY MELANOMA, WITH TMB (.tmb.MEL)
immdata.TcrBcr.full.red.sub.tmb.MEL <- immdata.TcrBcr.full.red.sub.tmb %>% filter(tissue=="melanoma") %>% droplevels()#;nrow(immdata.TcrBcr.full.red.sub.tmb.MEL)
## ALL MELANOMA AND BLADDER, WITH TMB (.tmb.ALL)
immdata.TcrBcr.full.red.sub.tmb.ALL <- immdata.TcrBcr.full.red.sub.tmb %>% filter(tissue %in% c("melanoma","bladder")) %>% droplevels()#;nrow(immdata.TcrBcr.full.red.sub.tmb.ALL)
# ALL DATA, MELANOMA AND BLADDER, BUT LEAVE MISSING TMB (.ALL)
immdata.TcrBcr.full.red.sub.ALL <- immdata.TcrBcr.full.red.sub %>% filter(tissue %in% c("melanoma","bladder")) %>% droplevels()#;nrow(immdata.TcrBcr.full.red.sub.ALL)

# Which are the covariates I am considering
covariatesIn <- measure.vars[c(1:2,4:7,14:16)]
responseVar <- "response"

## INSTEAD OF FILTERING FOR NO MISSING TMB, I DO THAT FOR ALL COVARIATES OF INTEREST
# Remove rows that have outliers/missing values since I get errors with Knn imputation
immdata.TcrBcr.full.red.sub.ALL.filt <- immdata.TcrBcr.full.red.sub.ALL %>% filter_at(vars(covariatesIn), all_vars(complete.cases(.))) %>% droplevels()
# DOING THE SAME FOR THE ORIGINAL DATA - USE THIS WHEN TESTING
# these data include other tissues apart from melanoma and bladder
immdata.TcrBcr.full.red.sub.filt <- immdata.TcrBcr.full.red.sub %>% filter_at(vars(covariatesIn), all_vars(complete.cases(.))) %>% droplevels()

# FINAL DATA FOR MODELLING
# Remove the Liu dataset for the model testing
immdata.TcrBcr.BLAD.MEL.ready <-immdata.TcrBcr.full.red.sub.ALL.filt %>% filter(dataset %notin% c("Liu") ) %>% droplevels() # All melanoma & bladder, PD, CR, PR, complete data
immdata.TcrBcr.ALL.ready <- immdata.TcrBcr.full.red.sub.filt %>% filter(dataset %notin% c("Liu") ) %>% droplevels() # All tissues, PD, CR, PR, complete data

## For gastric
immdata.TcrBcr.full.red.sub.filt.gastric <- immdata.TcrBcr.full.red.sub %>% filter(tissue %in% c("gastric") ) %>%  
                                            filter_at(vars(covariatesIn[-6]), all_vars(complete.cases(.))) %>% droplevels()

3 Missing data

3.1 All together

## covariatesIn,"OS","OS.time","age","gender","disease_status"
factors <- c(covariatesIn,"OS","OS.time","age","gender","disease_status","tissue","dataset")#,"prior_treatment"
df.full <- immdata.TcrBcr.full.red %>% select(all_of(factors)) %>% droplevels()

df.full$gender <- ifelse(df.full$gender=="",NA,df.full$gender)
df.full$disease_status <- ifelse(df.full$disease_status=="",NA,
                                 ifelse(df.full$disease_status=="Unknown",NA,df.full$disease_status))


# Are there missing values in the dataset?
any_na(df.full)

[1] TRUE

# How many?
n_miss(df.full)

[1] 1215

prop_miss(df.full)

[1] 0.1092626

# Which variables are affected?
df.full %>% is.na() %>% colSums() %>% pander()
Table continues below
TCRArichness.ds BCR.IGHrichness.ds TIS_sigscore PDL1 TumorPurity
0 0 0 0 0
Table continues below
logTMB Bcells CD8cells CD4cells OS OS.time age gender
198 0 0 0 128 130 496 87
disease_status tissue dataset
176 0 0
datatable(miss_var_summary(df.full), extensions = 'Buttons', options = list(
    dom = 'Bfrtip',
    buttons = c('copy', 'excel', 'csv' ),
    scrollX=TRUE,
    pageLength=15
  ),
  caption = 'number of missings per variable'
)
datatable(miss_var_table(df.full), extensions = 'Buttons', options = list(
    dom = 'Bfrtip',
    buttons = c('copy', 'excel', 'csv' ),
    scrollX=TRUE,
    pageLength=15
  ),
  caption = 'Tabulate the missings in the variables'
)
datatable(miss_case_summary(df.full), extensions = 'Buttons', options = list(
    dom = 'Bfrtip',
    buttons = c('copy', 'excel', 'csv' ),
    scrollX=TRUE,
    pageLength=15
  ),
  caption = 'number of missings per participant (n and %)'
)
datatable(miss_case_table(df.full), extensions = 'Buttons', options = list(
    dom = 'Bfrtip',
    buttons = c('copy', 'excel', 'csv' ),
    scrollX=TRUE,
    pageLength=15
  ),
  caption = 'number of missings per participant (n and %)'
)
# Get number of missings per participant (n and %)
gg_miss_var(df.full)

vis_miss(df.full) + theme(axis.text.x = element_text(angle=80))

gg_miss_upset(df.full)

3.2 Missing in histologies

3.2.1 Melanoma

3.2.1.1 ALL

# Which variables are affected?
df.full  %>% dplyr::filter(tissue=="melanoma") %>% droplevels() %>% is.na() %>% colSums() %>% pander()
Table continues below
TCRArichness.ds BCR.IGHrichness.ds TIS_sigscore PDL1 TumorPurity
0 0 0 0 0
Table continues below
logTMB Bcells CD8cells CD4cells OS OS.time age gender
68 0 0 0 0 2 92 3
disease_status tissue dataset
44 0 0
# Get number of missings per participant (n and %)
gg_miss_var(df.full  %>% dplyr::filter(tissue=="melanoma") %>% droplevels())

vis_miss(df.full  %>% dplyr::filter(tissue=="melanoma") %>% droplevels()) + theme(axis.text.x = element_text(angle=80))

gg_miss_upset(df.full  %>% dplyr::filter(tissue=="melanoma") %>% droplevels())

3.2.1.2 Datasets

3.2.1.2.1 Boxplots
sapply(colnames(df.full)[1:9], function(x) ggplot(df.full  %>% dplyr::filter(tissue=="melanoma") %>% droplevels(), aes_string(x = "dataset", y = x)) + geom_boxplot() + geom_jitter(size=0.9, alpha=0.9,aes_string(x = "dataset", y = x, color="disease_status")), simplify = FALSE)

$TCRArichness.ds $BCR.IGHrichness.ds $TIS_sigscore $PDL1 $TumorPurity $logTMB $Bcells $CD8cells $CD4cells

sapply(colnames(df.full)[1:9], function(x) ggplot(df.full  %>% dplyr::filter(tissue=="melanoma") %>% droplevels(), aes_string(x = "dataset", y = x)) + geom_boxplot() + geom_jitter(size=0.9, alpha=0.9,aes_string(x = "dataset", y = x, color="gender")), simplify = FALSE)

$TCRArichness.ds $BCR.IGHrichness.ds $TIS_sigscore $PDL1 $TumorPurity $logTMB $Bcells $CD8cells $CD4cells

sapply(colnames(df.full)[1:9], function(x) ggplot(df.full  %>% dplyr::filter(tissue=="melanoma") %>% droplevels(), aes_string(x = "dataset", y = x)) + geom_boxplot() + geom_jitter(size=0.9, alpha=0.9,aes_string(x = "dataset", y = x, color="age")) + scale_color_viridis(alpha=0.6), simplify = FALSE)

$TCRArichness.ds $BCR.IGHrichness.ds $TIS_sigscore $PDL1 $TumorPurity $logTMB $Bcells $CD8cells $CD4cells

sapply(colnames(df.full)[1:9], function(x) ggplot(df.full  %>% dplyr::filter(tissue=="melanoma") %>% droplevels(), aes_string(x = "dataset", y = x)) + geom_boxplot() + geom_jitter(size=0.9, alpha=0.9,aes_string(x = "dataset", y = x, color="prior_treatment")), simplify = FALSE)
3.2.1.2.2 Histograms
3.2.1.2.3 Other
df.full.mel <- df.full  %>% dplyr::filter(tissue=="melanoma") %>% droplevels() %>% group_split(dataset)
# Which variables are affected?

df.full.mel %>% map(is.na) %>% map(colSums)

[[1]] TCRArichness.ds BCR.IGHrichness.ds TIS_sigscore PDL1 0 0 0 0 TumorPurity logTMB Bcells CD8cells 0 38 0 0 CD4cells OS OS.time age 0 0 0 0 gender disease_status tissue dataset 0 38 0 0

[[2]] TCRArichness.ds BCR.IGHrichness.ds TIS_sigscore PDL1 0 0 0 0 TumorPurity logTMB Bcells CD8cells 0 0 0 0 CD4cells OS OS.time age 0 0 1 0 gender disease_status tissue dataset 0 0 0 0

[[3]] TCRArichness.ds BCR.IGHrichness.ds TIS_sigscore PDL1 0 0 0 0 TumorPurity logTMB Bcells CD8cells 0 0 0 0 CD4cells OS OS.time age 0 0 0 89 gender disease_status tissue dataset 0 0 0 0

[[4]] TCRArichness.ds BCR.IGHrichness.ds TIS_sigscore PDL1 0 0 0 0 TumorPurity logTMB Bcells CD8cells 0 3 0 0 CD4cells OS OS.time age 0 0 0 3 gender disease_status tissue dataset 3 6 0 0

[[5]] TCRArichness.ds BCR.IGHrichness.ds TIS_sigscore PDL1 0 0 0 0 TumorPurity logTMB Bcells CD8cells 0 27 0 0 CD4cells OS OS.time age 0 0 1 0 gender disease_status tissue dataset 0 0 0 0

df.full.mel %>% map(gg_miss_var)

[[1]] [[2]] [[3]] [[4]] [[5]]

df.full.mel %>% map(vis_miss) #%>% map(theme(axis.text.x = element_text(angle=80)))

[[1]] [[2]] [[3]] [[4]] [[5]]

#df.full.mel %>% map(gg_miss_upset)

3.2.2 Bladder

3.2.2.1 ALL

# Which variables are affected?
df.full  %>% dplyr::filter(tissue=="bladder") %>% droplevels() %>% is.na() %>% colSums()  %>% pander()
Table continues below
TCRArichness.ds BCR.IGHrichness.ds TIS_sigscore PDL1 TumorPurity
0 0 0 0 0
Table continues below
logTMB Bcells CD8cells CD4cells OS OS.time age gender
58 0 0 0 68 68 344 68
disease_status tissue dataset
98 0 0
# Get number of missings per participant (n and %)
gg_miss_var(df.full  %>% dplyr::filter(tissue=="bladder") %>% droplevels())

vis_miss(df.full  %>% dplyr::filter(tissue=="bladder") %>% droplevels()) + theme(axis.text.x = element_text(angle=80))

gg_miss_upset(df.full  %>% dplyr::filter(tissue=="bladder") %>% droplevels())

3.2.2.2 Datasets

3.2.2.2.1 Boxplots
sapply(colnames(df.full)[1:9], function(x) ggplot(df.full  %>% dplyr::filter(tissue=="bladder") %>% droplevels(), aes_string(x = "dataset", y = x)) + geom_boxplot(), simplify = FALSE)

$TCRArichness.ds $BCR.IGHrichness.ds $TIS_sigscore $PDL1 $TumorPurity $logTMB $Bcells $CD8cells $CD4cells

sapply(colnames(df.full)[1:9], function(x) ggplot(df.full  %>% dplyr::filter(tissue=="bladder") %>% droplevels(), aes_string(x = "dataset", y = x)) + geom_boxplot() + geom_jitter(size=0.9, alpha=0.9,aes_string(x = "dataset", y = x, color="gender")), simplify = FALSE)

$TCRArichness.ds $BCR.IGHrichness.ds $TIS_sigscore $PDL1 $TumorPurity $logTMB $Bcells $CD8cells $CD4cells

sapply(colnames(df.full)[1:9], function(x) ggplot(df.full  %>% dplyr::filter(tissue=="bladder") %>% droplevels(), aes_string(x = "dataset", y = x)) + geom_boxplot() + geom_jitter(size=0.9, alpha=0.9,aes_string(x = "dataset", y = x, color="age")) + scale_color_viridis(alpha=0.6), simplify = FALSE)

$TCRArichness.ds $BCR.IGHrichness.ds $TIS_sigscore $PDL1 $TumorPurity $logTMB $Bcells $CD8cells $CD4cells

3.2.2.2.2 Histograms
3.2.2.2.3 Other
df.full.blad <- df.full  %>% dplyr::filter(tissue=="bladder") %>% droplevels() %>% group_split(dataset)
# Which variables are affected?


df.full.blad %>% map(is.na) %>% map(colSums) 

[[1]] TCRArichness.ds BCR.IGHrichness.ds TIS_sigscore PDL1 0 0 0 0 TumorPurity logTMB Bcells CD8cells 0 57 0 0 CD4cells OS OS.time age 0 0 0 276 gender disease_status tissue dataset 0 30 0 0

[[2]] TCRArichness.ds BCR.IGHrichness.ds TIS_sigscore PDL1 0 0 0 0 TumorPurity logTMB Bcells CD8cells 0 1 0 0 CD4cells OS OS.time age 0 68 68 68 gender disease_status tissue dataset 68 68 0 0

df.full.blad %>% map(gg_miss_var)

[[1]] [[2]]

df.full.blad %>% map(vis_miss) #%>% map(theme(axis.text.x = element_text(angle=80)))

[[1]] [[2]]

df.full.blad %>% map(gg_miss_upset)

[[1]] [[2]]

3.2.3 RCC

3.2.3.1 ALL

# Which variables are affected?
df.full  %>% dplyr::filter(tissue=="RCC") %>% droplevels() %>% is.na() %>% colSums()  %>% pander()
Table continues below
TCRArichness.ds BCR.IGHrichness.ds TIS_sigscore PDL1 TumorPurity
0 0 0 0 0
Table continues below
logTMB Bcells CD8cells CD4cells OS OS.time age gender
29 0 0 0 60 60 60 16
disease_status tissue dataset
34 0 0
# Get number of missings per participant (n and %)
gg_miss_var(df.full  %>% dplyr::filter(tissue=="RCC") %>% droplevels())

vis_miss(df.full  %>% dplyr::filter(tissue=="RCC") %>% droplevels()) + theme(axis.text.x = element_text(angle=80))

gg_miss_upset(df.full  %>% dplyr::filter(tissue=="RCC") %>% droplevels())

3.2.3.2 Datasets

3.2.3.2.1 Boxplots
sapply(colnames(df.full)[1:9], function(x) ggplot(df.full  %>% dplyr::filter(tissue=="RCC") %>% droplevels(), aes_string(x = "dataset", y = x)) + geom_boxplot(), simplify = FALSE)

$TCRArichness.ds $BCR.IGHrichness.ds $TIS_sigscore $PDL1 $TumorPurity $logTMB $Bcells $CD8cells $CD4cells

sapply(colnames(df.full)[1:9], function(x) ggplot(df.full  %>% dplyr::filter(tissue=="RCC") %>% droplevels(), aes_string(x = "dataset", y = x)) + geom_boxplot() + geom_jitter(size=0.9, alpha=0.9,aes_string(x = "dataset", y = x, color="gender")), simplify = FALSE)

$TCRArichness.ds $BCR.IGHrichness.ds $TIS_sigscore $PDL1 $TumorPurity $logTMB $Bcells $CD8cells $CD4cells

sapply(colnames(df.full)[1:9], function(x) ggplot(df.full  %>% dplyr::filter(tissue=="RCC") %>% droplevels(), aes_string(x = "dataset", y = x)) + geom_boxplot() + geom_jitter(size=0.9, alpha=0.9,aes_string(x = "dataset", y = x, color="age")) + scale_color_viridis(alpha=0.6), simplify = FALSE)

$TCRArichness.ds $BCR.IGHrichness.ds $TIS_sigscore $PDL1 $TumorPurity $logTMB $Bcells $CD8cells $CD4cells

3.2.3.2.2 Histograms
3.2.3.2.3 Other
df.full.rcc <- df.full  %>% dplyr::filter(tissue=="RCC") %>% droplevels() %>% group_split(dataset)
# Which variables are affected?


df.full.rcc %>% map(is.na) %>% map(colSums) 

[[1]] TCRArichness.ds BCR.IGHrichness.ds TIS_sigscore PDL1 0 0 0 0 TumorPurity logTMB Bcells CD8cells 0 16 0 0 CD4cells OS OS.time age 0 60 60 60 gender disease_status tissue dataset 16 6 0 0

[[2]] TCRArichness.ds BCR.IGHrichness.ds TIS_sigscore PDL1 0 0 0 0 TumorPurity logTMB Bcells CD8cells 0 13 0 0 CD4cells OS OS.time age 0 0 0 0 gender disease_status tissue dataset 0 28 0 0

df.full.rcc %>% map(gg_miss_var)

[[1]] [[2]]

df.full.rcc %>% map(vis_miss) #%>% map(theme(axis.text.x = element_text(angle=80)))

[[1]] [[2]]

df.full.rcc %>% map(gg_miss_upset)

[[1]] [[2]]

3.2.4 Gastric

3.2.4.1 Boxplots

sapply(colnames(df.full)[1:9], function(x) ggplot(df.full  %>% dplyr::filter(tissue=="gastric") %>% droplevels(), aes_string(x = "dataset", y = x)) + geom_boxplot(), simplify = FALSE)

$TCRArichness.ds $BCR.IGHrichness.ds $TIS_sigscore $PDL1 $TumorPurity $logTMB $Bcells $CD8cells $CD4cells

sapply(colnames(df.full)[1:9], function(x) ggplot(df.full  %>% dplyr::filter(tissue=="gastric") %>% droplevels(), aes_string(x = "dataset", y = x)) + geom_boxplot() + geom_jitter(size=0.9, alpha=0.9,aes_string(x = "dataset", y = x, color="gender")), simplify = FALSE)

$TCRArichness.ds $BCR.IGHrichness.ds $TIS_sigscore $PDL1 $TumorPurity $logTMB $Bcells $CD8cells $CD4cells

sapply(colnames(df.full)[1:9], function(x) ggplot(df.full  %>% dplyr::filter(tissue=="gastric") %>% droplevels(), aes_string(x = "dataset", y = x)) + geom_boxplot() + geom_jitter(size=0.9, alpha=0.9,aes_string(x = "dataset", y = x, color="age")) + scale_color_viridis(alpha=0.6), simplify = FALSE)

$TCRArichness.ds $BCR.IGHrichness.ds $TIS_sigscore $PDL1 $TumorPurity $logTMB $Bcells $CD8cells $CD4cells

3.2.4.2 Histograms

3.2.4.3 Other

# Which variables are affected?
df.full  %>% dplyr::filter(tissue=="gastric") %>% droplevels() %>% is.na() %>% colSums()  %>% pander()
Table continues below
TCRArichness.ds BCR.IGHrichness.ds TIS_sigscore PDL1 TumorPurity
0 0 0 0 0
Table continues below
logTMB Bcells CD8cells CD4cells OS OS.time age gender
43 0 0 0 0 0 0 0
disease_status tissue dataset
0 0 0
# Get number of missings per participant (n and %)
gg_miss_var(df.full  %>% dplyr::filter(tissue=="gastric") %>% droplevels())

vis_miss(df.full  %>% dplyr::filter(tissue=="gastric") %>% droplevels()) + theme(axis.text.x = element_text(angle=80))

#gg_miss_upset(df.full  %>% dplyr::filter(tissue=="gastric") %>% droplevels())

4 Melanoma & age/gender/disease status

ggplot(df.full  %>% dplyr::filter(tissue=="melanoma") %>% droplevels(), aes(x = factor(dataset), y = age)) + geom_boxplot()

# ggplot(df.full  %>% dplyr::filter(tissue=="melanoma") %>% droplevels(), aes(x = factor(dataset), y = gender)) + geom_bar()

df.full  %>% dplyr::filter(tissue=="melanoma") %>% droplevels() %>%
  group_by(dataset) %>%
  dplyr::count(gender) %>%
  mutate(prop = n / sum(n)) %>%
  ggplot(aes(x = dataset, y = prop, fill = gender)) +
    geom_bar(stat = "identity") #+

    #coord_flip() 
# ggplot(df.full  %>% dplyr::filter(tissue=="melanoma") %>% droplevels(), aes(x = factor(dataset), y = gender)) + geom_bar()

df.full  %>% dplyr::filter(tissue=="melanoma") %>% droplevels() %>%
  group_by(dataset) %>%
  dplyr::count(disease_status) %>%
  mutate(prop = n / sum(n)) %>%
  ggplot(aes(x = dataset, y = prop, fill = disease_status)) +
    geom_bar(stat = "identity") #+

    #coord_flip() 

5 Bladder & age/gender/disease status

# ggplot(df.full  %>% dplyr::filter(tissue=="melanoma") %>% droplevels(), aes(x = factor(dataset), y = gender)) + geom_bar()

df.full  %>% dplyr::filter(tissue=="bladder") %>% droplevels() %>%
  group_by(dataset) %>%
  dplyr::count(gender) %>%
  mutate(prop = n / sum(n)) %>%
  ggplot(aes(x = dataset, y = prop, fill = gender)) +
    geom_bar(stat = "identity") #+

    #coord_flip() 
# ggplot(df.full  %>% dplyr::filter(tissue=="melanoma") %>% droplevels(), aes(x = factor(dataset), y = gender)) + geom_bar()

df.full  %>% dplyr::filter(tissue=="bladder") %>% droplevels() %>%
  group_by(dataset) %>%
  dplyr::count(disease_status) %>%
  mutate(prop = n / sum(n)) %>%
  ggplot(aes(x = dataset, y = prop, fill = disease_status)) +
    geom_bar(stat = "identity") #+

    #coord_flip() 

6 RCC & age/gender/disease status

ggplot(df.full  %>% dplyr::filter(tissue=="RCC") %>% droplevels(), aes(x = factor(dataset), y = age)) + geom_boxplot()

# ggplot(df.full  %>% dplyr::filter(tissue=="melanoma") %>% droplevels(), aes(x = factor(dataset), y = gender)) + geom_bar()

df.full  %>% dplyr::filter(tissue=="RCC") %>% droplevels() %>%
  group_by(dataset) %>%
  dplyr::count(gender) %>%
  mutate(prop = n / sum(n)) %>%
  ggplot(aes(x = dataset, y = prop, fill = gender)) +
    geom_bar(stat = "identity") #+

    #coord_flip() 
# ggplot(df.full  %>% dplyr::filter(tissue=="melanoma") %>% droplevels(), aes(x = factor(dataset), y = gender)) + geom_bar()

df.full  %>% dplyr::filter(tissue=="RCC") %>% droplevels() %>%
  group_by(dataset) %>%
  dplyr::count(disease_status) %>%
  mutate(prop = n / sum(n)) %>%
  ggplot(aes(x = dataset, y = prop, fill = disease_status)) +
    geom_bar(stat = "identity") #+

    #coord_flip() 

7 Calculate Number of events for each histology, numbe rof covariates etc.

8 A tibble: 4 x 1

tissue

1 bladder 2 gastric 3 melanoma 4 RCC

events <-sapply(test, function(x)  x %>% dplyr::count(OS) %>% filter(OS==1) %>% pull(n))
names(events) <-group_keys(ir) %>% pull(tissue)
pander(events)
bladder gastric melanoma RCC
180 23 118 19

Now to get a number on how many events per covariate:

# Divide by the minimum of covariates
pander(events/9)
bladder gastric melanoma RCC
20 2.556 13.11 2.111

Check also how many events and cencored

df.full %>%
  group_by(tissue) %>%
  dplyr::count(OS) %>%
  mutate(prop = n / sum(n)) %>%
  ggplot(aes(x = tissue, y = prop, fill = OS)) +
    geom_bar(stat = "identity") + theme(axis.text.x = element_text(angle=45))

df.full %>%
  group_by(dataset) %>%
  dplyr::count(OS) %>%
  mutate(prop = n / sum(n)) %>%
  ggplot(aes(x = dataset, y = prop, fill = OS)) +
    geom_bar(stat = "identity") + theme(axis.text.x = element_text(angle=45))

9 Imputation (can we?)

Even if we find that MAR is covered (per histology), we still have a problem imputing missing values across different datasets that make up the tissue cohorts. We are using multicenter data, data coming from different trials and the bias can be much more from what we see.

What is missing from where:

9.1 Melanoma


melMis <-ggarrange(plotlist=misPlots, common.legend = TRUE,legend="none", labels=c("Riaz et al.","Hugo et al.","Gide et al.","Liu et al.","Rodriguez et al."),font.label = list(size = 15, color = "black", face = "bold", family = "Palatino Linotype")
)
melMis

  • logTMB is missing completely from Gide and Rodriguez datasets
  • Some OS.time data missing from Hugo
  • age is missing completely from Liu
  • Riaz is missing some TMB, age and gender data
  • Rodriguez is missing also one OS.time

9.2 Bladder

bladMis <- ggarrange(plotlist=misPlots, common.legend = TRUE,legend="none", labels=c("Mariathasan et al.","Powles et al."),font.label = list(size = 15, color = "black", face = "bold", family = "Palatino Linotype"))
bladMis

  • Powles does not have OS data at all-NOT USED
  • Powles does not have gender data either
  • Age is missing completely from the bladder data
  • Mariathasan is missing some TMB data

9.3 RCC

rccMis <-ggarrange(plotlist=misPlots, common.legend = TRUE,legend="none", labels=c("McDermott et al.","Miao et al."),font.label = list(size = 15, color = "black", face = "bold", family = "Palatino Linotype"))
rccMis

  • Immotion does not have OS at all-NOT USED
  • Miao is missing some TMB data (around half)

ONE DATASET USED FOR SURVIVAL

9.4 Gastric

gasMis<-ggarrange(plotlist=misPlots, common.legend = TRUE,legend="bottom", labels=c("Kim et al."),font.label = list(size = 15, color = "black", face = "bold", family = "Palatino Linotype"))
gasMis

  • Gastric does not have TMB data at all

  • Conclusion

Imputation is no worth it and not consistently possible for our analyses.

10 Supplemental Figure 2

# library(cowplot)
p <- ggdraw() + 
          draw_plot(melMis,x = 0, y = 0.6, width = .98, height = .38) +
          draw_plot(rccMis,x = 0, y = 0.41, width = .68, height = .18) + 
          draw_plot(bladMis,x = 0, y = 0.22, width = .8, height =.18)+
          draw_plot(gasMis,x = 0, y = 0, width = .333, height = .22)+
          draw_plot_label(label = c("A: SKCM anti-PD-1/L1", 
                     "B: KIRC anti-PD-1/L1", 
                     "C: BLCA anti-PD-1/L1",
                     "D: STAD anti-PD-1/L1"), size = 16,
                     x = c(-0.04,-0.038,-0.038,-0.038), y = c(.99,.6,.41,.23),family="Palatino Linotype")
p

cairo_pdf(filename = paste0(figuresPath,"Supplemental Figure S2",".pdf"),family = "Palatino Linotype", width = 22, height=18)
  print(p)
dev.off()

png 2

# ggsave(p, filename = paste0(figuresPath,"Supplemental Figure S2.tiff"), dpi = 600,width = 22,
#        height = 22, units = "in")

11 Session

session_info <- sessionInfo()
writeLines(capture.output(session_info), paste0(sessionInfoPath,"E_07_MissingData.txt"))

sessionInfo()
## R version 4.1.0 (2021-05-18)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19045)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.1252 
## [2] LC_CTYPE=English_United States.1252   
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.1252    
## 
## attached base packages:
## [1] parallel  grid      stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] dichromat_2.0-0.1      ggthemes_4.2.4         cowplot_1.1.1         
##  [4] viridis_0.6.2          viridisLite_0.4.1      naniar_0.6.1          
##  [7] ggsci_2.9              flextable_0.7.0        ggplotify_0.1.0       
## [10] survminer_0.4.9        reconPlots_0.1.0       rlist_0.4.6.2         
## [13] pander_0.6.5           RColorBrewer_1.1-3     precrec_0.12.9        
## [16] scales_1.2.0           doParallel_1.0.17      iterators_1.0.14      
## [19] foreach_1.5.2          factoextra_1.0.7       OutlierDetection_0.1.1
## [22] FactoMineR_2.4         DataExplorer_0.8.2     DMwR_0.1.0            
## [25] cluster_2.1.2          abind_1.4-5            rpart_4.1.16          
## [28] class_7.3-20           ROCR_1.0-11            quantmod_0.4.20       
## [31] TTR_0.24.3             xts_0.12.1             zoo_1.8-10            
## [34] caret_6.0-92           Amelia_1.8.0           Rcpp_1.0.9            
## [37] mlbench_2.1-3          ggpubr_0.4.0           hrbrthemes_0.8.0      
## [40] Hmisc_4.7-0            Formula_1.2-4          survival_3.3-1        
## [43] lattice_0.20-45        strex_1.4.2            data.table_1.14.2     
## [46] forcats_0.5.1          purrr_0.3.4            readr_2.1.2           
## [49] tidyr_1.2.0            ggplot2_3.3.6          tidyverse_1.3.1       
## [52] tibble_3.1.7           plyr_1.8.7             dplyr_1.0.9           
## [55] stringr_1.4.0          DT_0.22                extrafont_0.18        
## [58] pacman_0.5.1          
## 
## loaded via a namespace (and not attached):
##   [1] utf8_1.2.2            tidyselect_1.1.2      htmlwidgets_1.5.4    
##   [4] pROC_1.18.0           munsell_0.5.0         codetools_0.2-18     
##   [7] interp_1.1-3          future_1.26.1         withr_2.5.0          
##  [10] spatstat.random_2.2-0 colorspace_2.0-3      highr_0.9            
##  [13] uuid_1.1-0            knitr_1.41            rstudioapi_0.13      
##  [16] leaps_3.1             stats4_4.1.0          officer_0.4.4        
##  [19] ggsignif_0.6.3        tensor_1.5            Rttf2pt1_1.3.10      
##  [22] listenv_0.8.0         labeling_0.4.2        KMsurv_0.1-5         
##  [25] mnormt_2.0.2          polyclip_1.10-0       farver_2.1.1         
##  [28] depth_2.1-1.1         parallelly_1.32.1     vctrs_0.4.1          
##  [31] generics_0.1.3        circular_0.4-95       ipred_0.9-13         
##  [34] xfun_0.35             R6_2.5.1              gridGraphics_0.5-1   
##  [37] spatstat.utils_2.3-1  assertthat_0.2.1      networkD3_0.4        
##  [40] nnet_7.3-17           gtable_0.3.1          globals_0.15.1       
##  [43] goftest_1.2-3         timeDate_4021.104     rlang_1.1.3          
##  [46] systemfonts_1.0.4     scatterplot3d_0.3-41  splines_4.1.0        
##  [49] rstatix_0.7.0         extrafontdb_1.0       lazyeval_0.2.2       
##  [52] ModelMetrics_1.2.2.2  spatstat.geom_2.4-0   broom_1.0.1          
##  [55] checkmate_2.1.0       rgl_0.108.3           yaml_2.3.6           
##  [58] reshape2_1.4.4        modelr_0.1.8          crosstalk_1.2.0      
##  [61] backports_1.4.1       tools_4.1.0           lava_1.6.10          
##  [64] ellipsis_0.3.2        spatstat.core_2.4-2   jquerylib_0.1.4      
##  [67] base64enc_0.1-3       deldir_1.0-6          haven_2.5.1          
##  [70] ggrepel_0.9.1         fs_1.5.2              magrittr_2.0.3       
##  [73] reprex_2.0.1          RANN_2.6.1            tmvnsim_1.0-2        
##  [76] mvtnorm_1.1-3         xtable_1.8-4          hms_1.1.2            
##  [79] evaluate_0.18         jpeg_0.1-9            readxl_1.4.0         
##  [82] gridExtra_2.3         compiler_4.1.0        crayon_1.5.2         
##  [85] htmltools_0.5.3       mgcv_1.8-40           tzdb_0.3.0           
##  [88] visdat_0.5.3          lubridate_1.8.0       DBI_1.1.2            
##  [91] dbplyr_2.1.1          MASS_7.3-57           boot_1.3-28          
##  [94] Matrix_1.4-0          car_3.1-1             cli_3.6.2            
##  [97] gower_1.0.0           igraph_1.3.1          km.ci_0.5-6          
## [100] pkgconfig_2.0.3       flashClust_1.01-2     foreign_0.8-82       
## [103] plotly_4.10.0         spatstat.sparse_2.1-1 recipes_1.0.1        
## [106] xml2_1.3.3            ldbod_0.1.2           bslib_0.3.1          
## [109] hardhat_1.2.0         prodlim_2019.11.13    rvest_1.0.2          
## [112] yulab.utils_0.0.4     digest_0.6.29         spatstat.data_2.2-0  
## [115] rmarkdown_2.14        cellranger_1.1.0      survMisc_0.5.6       
## [118] htmlTable_2.4.1       gdtools_0.2.4         curl_5.2.0           
## [121] lifecycle_1.0.1       nlme_3.1-157          jsonlite_1.8.3       
## [124] carData_3.0-5         fansi_1.0.3           pillar_1.8.0         
## [127] spatstat.linnet_2.3-2 fastmap_1.1.0         httr_1.4.7           
## [130] glue_1.6.2            UpSetR_1.4.0          zip_2.2.0            
## [133] spatstat_2.3-4        png_0.1-7             stringi_1.7.8        
## [136] sass_0.4.1            latticeExtra_0.6-30   future.apply_1.9.0